library(ggplot2)
library(ggpubr)
library(CDM)
library(boot)
library(tidyverse)
library(dummy)
library(stringi)
library(stringr)
rm(list = ls())

x_pre <- read_csv("C:\\Users\\rkm160630\\OneDrive - The University of Texas at Dallas\\Ritesh Folder\\PhD\\FirstYearProject\\data\\OUTPUT.csv")
Parsed with column specification:
cols(
  .default = col_character(),
  SubjectID = col_double(),
  `Auto Score 1` = col_double(),
  `Auto Score 2` = col_double(),
  `Auto Score 3` = col_double(),
  `Auto Score 4` = col_double(),
  `Auto Score 5` = col_double(),
  `Auto Score 6` = col_double(),
  `Auto Score 7` = col_double(),
  `Auto Score 8` = col_double(),
  `Auto Score 9` = col_double(),
  `Auto Score 10` = col_double(),
  `Auto Score 11` = col_double(),
  `Auto Score 12` = col_double(),
  `Auto Score 13` = col_double(),
  `Auto Score 14` = col_double(),
  `Auto Score 15` = col_double(),
  `Auto Score 16` = col_double(),
  `Auto Score 17` = col_double(),
  `Auto Score 18` = col_double(),
  `Auto Score 19` = col_double()
  # ... with 34 more columns
)
See spec(...) for full column specifications.

Q_from_book <- read_csv("C:\\Users\\rkm160630\\OneDrive - The University of Texas at Dallas\\Ritesh Folder\\PhD\\FirstYearProject\\data\\final_result_similar.csv") %>% mutate(`Learning Objective` = `APA Learning Objective`)
Parsed with column specification:
cols(
  Question = col_character(),
  Option1 = col_character(),
  Option2 = col_character(),
  Option3 = col_character(),
  Option4 = col_character(),
  Answer = col_character(),
  `Learning Objective` = col_character(),
  Topic = col_character(),
  `Difficulty Level` = col_character(),
  `Skill Level` = col_character(),
  `APA Learning Objective` = col_character()
)
Q_from_book <- Q_from_book %>% 
  mutate(`Learning Objective` = str_trim(str_remove_all(`Learning Objective`, "\\."))) %>% 
  filter(`Learning Objective` != "nan")
  

glimpse(Q_from_book)
Observations: 992
Variables: 11
$ Question                 <chr> "Which of the following is an example of social influence?", "Which of the following is an example of a d...
$ Option1                  <chr> "a. You feel guilty because you lied to your trusting professor about your assignment.", "a. A bully thre...
$ Option2                  <chr> "b. When you get hungry, you have trouble concentrating.", "b. Ramona works hard in school to make her mo...
$ Option3                  <chr> "c. You didn\u0092t do well on the test because you stayed up all night cramming.", "c. Marianne thinks o...
$ Option4                  <chr> "d. You almost fall asleep at the wheel, so you pull off the road to take a short nap.", "d. Jason moves ...
$ Answer                   <chr> "A", "A", "D", "C", "A", "C", "C", "B", "D", "D", "C", "B", "C", "A", "B", "C", "D", "D", "D", "C", "A", ...
$ `Learning Objective`     <chr> "11 Describe key concepts, principles, and overarching themes in psychology", "11 Describe key concepts, ...
$ Topic                    <chr> "Defining Social Psychology", "Defining Social Psychology", "Defining Social Psychology", "Defining Socia...
$ `Difficulty Level`       <chr> "Moderate", "Moderate", "Moderate", "Moderate", "Easy", "Moderate", "Moderate", "Moderate", "Moderate", "...
$ `Skill Level`            <chr> "Understand the Concepts", "Understand the Concepts", "Understand the Concepts", "Understand the Concepts...
$ `APA Learning Objective` <chr> "1.1 Describe key concepts, principles, and overarching themes in psychology.", "1.1 Describe key concept...
learning_obj <- Q_from_book %>%
  distinct(`Learning Objective`) %>%
  mutate(lo_id = row_number())


Q_pre <- Q_from_book %>% inner_join(learning_obj) %>% select(Question, `Learning Objective`, lo_id) %>% mutate(temp = str_trim(str_replace_all(Question, "_|\\.", "")))
Joining, by = "Learning Objective"
learning_obj


Q_pre <- Q_from_book %>% inner_join(learning_obj) %>% select(Question, `Learning Objective`, lo_id) %>% 
  mutate(temp = str_trim(str_replace_all(Question, "_|\\.", ""))) %>%
  mutate(Q_UNIQUE_ID = row_number()) 
Joining, by = "Learning Objective"
Q_pre
NA

head(x_pre)
NA

x.gather <-x_pre %>% gather(key = "key", value = "value", -File, -SubjectID)
x.gather 
x.questions <- 
  
  x.gather %>% filter(str_detect(key, "Question")) %>%

  anti_join(
    x.gather %>% filter(str_detect(key, "Question")) %>% 
      group_by(File, SubjectID, value) %>% 
      summarise(cnt = n(), question_number = paste(key, collapse = ",")) %>% 
      filter(cnt > 1) %>% ungroup(),
            by = "value"
    ) # Taking out generic questions (having same question text but different answers)


x.questions.dist <- x.questions %>% distinct(value) %>% drop_na() %>%  
  #mutate(Q_UNIQUE_ID = row_number()) %>% 
  mutate(temp = str_trim(str_replace_all(value, "_|\\.", ""))) %>% 
  
  inner_join(
    Q_pre, by = "temp"
    
    
  )



x.questions.dist %>% write_csv("C:\\Users\\rkm160630\\OneDrive - The University of Texas at Dallas\\Ritesh Folder\\PhD\\FirstYearProject\\data\\Q_distinct_id.csv")
x.questions.dist
Q <- x.questions.dist %>% distinct(Q_UNIQUE_ID, lo_id) %>% arrange(Q_UNIQUE_ID) %>%
  mutate(present = 1) %>%
  
  spread(key = "lo_id", value = "present")

Q %>% write_csv("C:\\Users\\rkm160630\\OneDrive - The University of Texas at Dallas\\Ritesh Folder\\PhD\\FirstYearProject\\data\\Q.csv")
Q
NA

x.answers <- 
  
  x.gather %>% filter(!str_detect(key, "Question"))

x.answers

#Total Questions presented to students 53 Questions are randomly presented to students

x.questions %>% distinct(key)

x.questions.id <- x.questions %>% inner_join(x.questions.dist) #%>% mutate(Q_UNIQUE_ID  = factor(Q_UNIQUE_ID)) 
Joining, by = "value"
x.questions.id

Filter out Generic Questions

Questions with the same text but different answers


x.questions.id.filterd <- x.questions.id %>% 
  anti_join(
    x.questions.id %>% 
      group_by(File, SubjectID, Question) %>% 
      summarise(cnt = n(), question_number = paste(key, collapse = ",")) %>% 
      filter(cnt > 1) %>% ungroup(),
            by = "Question"
    ) %>% select(-lo_id, -`Learning Objective`)


x.questions.id.filterd
NA

We have the correct Questions. Now we need to add marks of answers against the questions.


X.pre <- x.questions.id %>% mutate(id = str_split(key, " ", simplify = TRUE)[,2]) %>% 
  
  inner_join(
    
    x.answers %>% mutate(id = str_split(key, " ", simplify = TRUE)[,3]), by = c("File", "SubjectID", "id")
    
    ) %>% 
  mutate(value.y = as.integer(value.y)) #%>% 
  #mutate(Q_UNIQUE_ID = as.integer(Q_UNIQUE_ID))

#write_csv(X.pre, "X_Pre.csv")
X.pre
unique(X.pre$Q_UNIQUE_ID)
  [1]  19 182 246 299 118  91 114  88  82   9 259 105  78  13 260  49  17 148 273 288  40 166  66 202 225 177 126 204 140 293 116 136 186 223
 [35] 149 229 122 281 137  64 291  67  33  30 271 504 966 480 972 477 981 493 496 460 522 465 403 376 441 354 529 514 474 430 439 446 348 416
 [69] 323 356 473 501 986 515 371 331 497 380 505 368 343 520 345 467 369 980 483 412 418 604 955 600 590 562 612 549 605 591 610 557 935 939
[103] 558 659 927 904 566 657 556 924 918 645 627 553 577 571 639 617 646 912 954 957 554 814 730 811 863 807 859 830 758 837 663 813 842 783
[137] 835 725 872 819 853 708 894 692 757 881 876 867 764 777 737 762 713 785 849 752 747 711 664 662 665 720 661 689 666 902 990 669 693 823
[171] 828 793 812 227 232  57  56 119 261 127 100 196 213  93  70 306  46 277 224  85  69  47 198  90 183  98 503 513 379 387 325 317 987 339
[205] 443 413 432 454 461 404 492 377 410 378 326 357 456 337 468 426 322 350 415 360 414 424 509 440 445 962 458 407 355 495 530 336 486 976
[239] 647 546 920 561 916 910 611 540 632 614 582 555 544 906 536 908 937 923 919 946 631 922 642 603 623 953 653 951 943 681 832 861 715 890
[273] 860 897 843 748 818 671 901 705 773 746 687 804 667 728 882 845 903 726 891 827 831 797 761 772 821 883 724 803 739 800 714 743 840 749
[307] 888 668 880 765 889 895 226  79 150 172  22 257 102 208 297 187 155 180  32  65 215  43 129 157 197 128 199 156 113  27 221 267  84 145
[341]   3 255  51 289 176 463 526 367 334 449 525 349 406 969 372 351 346 462 971 312 985 502 989 421 402 429 319 393 408 489 327 315 973 433
[375] 476 375 409 352 428 532 340 965 523 967 622 929 950 534 909 587 959 917 926 597 914 595 570 628 658 578 564 608 635 634 841 731 844 734
[409] 683 672 836 782 877 799 735 866 784 781 802 899 722 733 834 775 846 839 856 808 790 786 706 682 680 869 690 675 740 776 825 810 779 820
[443] 755  16 147 117 276 212 115 300  95 165 254 194  87  50  45 310 144 241 170  15  10 272   7  25 216 131 219 134 106 275  41 298 265 457
[477] 364 475 511 320 373 964 318 398 362 970 388 442 484 974 435 459 333 419 961 968 510 370 400 321 616 625 619 586 933 552 638 567 543 560
[511] 542 649 905 652 606 624 545 598 620 601 596 934 629 898 716 717 679 670 874 780 871 768 732 822 704 685 884 870 887 778 767 789 686 677
[545] 794 688 796 791 684 192 112 161 167  97 304 250 301  38  11 164  81 262  77 143  68 292  63 179 363 472 338 427 353 488 464 498 361 528
[579] 508 479 519 374 485 405 386 621 640 579 930 928 607 574 636 594 585 573 537 938 548 650 915 592 921 742 750 892 691 700 824 833 678 873
[613] 896 858 879 771 141 175 107 101  14 163  89  71 247 222 242 132 162 108 205 158 201 303  18  96 169 251 193 142 171 500 444 455 469 512
[647] 399 527 494 478 487 324 518 396 411 499 655 580 941 947 609 630 948 550 641 911 613 945 940 559 805 710 703 788 865 712 900 829 751 741
[681] 727 745 817 676 673  92 217 188 214 200   2 103  21 206 252 195 235  86  26 109 230 307 436 490 447 391 384 394 507 584 648 644 615 643
[715] 547 583 931 576 660 565 581 958 809 736 847 878 854 848 798 774 699 826 893 721 701  55 234 209  53 266 174 231 236 207 270 302 243  94
[749] 451 420 382 531 431 471 452 517 358 481 626 907 593 633 936 956 589 551 760 868 695 886 769 862 759 190 279 120  24 159 245  74 110  31
[783] 283 111 168  29 309 365 381 316 330 401 383 984 417 506 453 521 538 651 925 539 795 991 756 698 792 729 723 744 864 857 697 806 178 154
[817]  39 284 280 160 238   1   4  28   5   8  42 524 313 491 332 422 342 602 541 838 763 852 816  23  76 138  73 135  37 425 438 975 347 563
[851] 599 575 588 913 770 702 707 801 278 211 258  83 104  44 390 533 654 572 855 766  80 233 263 185 295  12 366 395 389 979 656 885 787  72
[885]  99 286 151 274 305 448 335 942 753 249 152 450 875  36 256 423 637 696 738 282 146  60  61 397 344  75  62 296   6 977 988 392 932 124
[919] 269 308 944  52  48 218 240 125 466 210 470 851 181 121 191 694 139 248  20 287 220


X<- X.pre %>% select(-key.x, -key.y, -value.x, -id, -temp, -lo_id, -`Learning Objective`, -Question ) %>% 
  spread(key = "Q_UNIQUE_ID", value = "value.y")  
  

write_csv(X, "C:\\Users\\rkm160630\\OneDrive - The University of Texas at Dallas\\Ritesh Folder\\PhD\\FirstYearProject\\data\\X.csv")
X

Let’s run some tests to verify X


X %>% select(-File, -SubjectID) %>% summarise_all(sum, na.rm = TRUE)
NA

X %>% gather(key = "QuestionID", value = "Score", -File, -SubjectID)
NA

Filter questions asked in Exam I


library(janitor)
X %>% filter(File == "Exam1Trial1") %>% remove_empty(.,which = "cols")
NA

Questions with good attempt count



question_attempted <- X %>% remove_empty(.,which = "cols") %>% 
  gather(key = "QuestionID", value = "Scores", -File, -SubjectID) %>% 
  group_by(File, QuestionID) %>%
  summarise(total_na = sum(is.na(Scores)), total = n(), total_attempted = total - total_na)

question_attempted <- question_attempted %>% filter(total_attempted >= 8)

question_attempted

#%>% filter(QuestionID == "103")

Filtering out questions with fewer attempts


X_filtered <- X %>% remove_empty(.,which = "cols") %>% 
  gather(key = "QuestionID", value = "Scores", -File, -SubjectID) %>% semi_join(question_attempted, by = c("File", "QuestionID")) %>% 
  spread(key = "QuestionID", value = "Scores")

X_filtered

Take away questions answered fewer than 8 times per exam

X %>% remove_empty(.,which = "cols") %>% write_csv("C:\\Users\\rkm160630\\OneDrive - The University of Texas at Dallas\\Ritesh Folder\\PhD\\FirstYearProject\\data\\X.csv")

X_filtered %>% remove_empty(.,which = "cols") %>% write_csv("C:\\Users\\rkm160630\\OneDrive - The University of Texas at Dallas\\Ritesh Folder\\PhD\\FirstYearProject\\data\\X_filtered.csv")

Write a separate CSV for each trial so that no file carries columns for questions that were not asked in that trial. This helps to show the true picture of sparsity.


# Drop every column of `df` that janitor::remove_empty() considers empty
# (which = "cols"); rows are left untouched.
#   df: a data frame / tibble.
# Returns `df` without its empty columns.
fn.clean <- function (df) {
  return(df %>% remove_empty(.,which = "cols"))
  
}


X.individual.list <- X %>% 
nest(-File, .key = "X_full") %>% 
  mutate(X = map(X_full, fn.clean), 
         Q_full = map(X_full, function(df) return (Q)))

X.individual.list
# A tibble: 8 x 4
  File        X_full              X                   Q_full            
  <chr>       <list>              <list>              <list>            
1 Exam1Trial1 <tibble [74 x 940]> <tibble [74 x 286]> <tibble [939 x 5]>
2 Exam1Trial2 <tibble [57 x 940]> <tibble [57 x 277]> <tibble [939 x 5]>
3 Exam2Trial1 <tibble [66 x 940]> <tibble [66 x 236]> <tibble [939 x 5]>
4 Exam2Trial2 <tibble [67 x 940]> <tibble [67 x 237]> <tibble [939 x 5]>
5 Exam3Trial1 <tibble [47 x 940]> <tibble [47 x 178]> <tibble [939 x 5]>
6 Exam3Trial2 <tibble [78 x 940]> <tibble [78 x 179]> <tibble [939 x 5]>
7 Exam4Trial1 <tibble [64 x 940]> <tibble [64 x 239]> <tibble [939 x 5]>
8 Exam4Trial2 <tibble [72 x 940]> <tibble [72 x 239]> <tibble [939 x 5]>

X %>% filter(File == "Exam1Trial1") %>% remove_empty(.,which = "cols")
NA

Merge with Q


Q
NA

# Build the skill matrix for one trial.
#   df: one trial's score table with a SubjectID column and one column per
#       Q_UNIQUE_ID (column names are converted with as.integer()).
# Returns the rows of `Q` for the questions present in `df`, with empty
# columns removed and remaining NA cells replaced by 0.
# `Q` is not a parameter: it is read from the enclosing (global) environment.
fn.skills <- function (df) {
  
  # Reduce the trial to the distinct question ids that still have a column
  # after empty columns are removed.
  df <- df %>% remove_empty(.,which = "cols") %>%
  gather(key = "Q_UNIQUE_ID", value = "Score", -SubjectID) %>%
  mutate(Q_UNIQUE_ID = as.integer(Q_UNIQUE_ID)) %>% distinct(Q_UNIQUE_ID) %>%
  
  # Natural join (no `by`): the only shared column is Q_UNIQUE_ID, as the
  # "Joining, by" message in the recorded output confirms.
  inner_join(
    Q
    
  ) %>% remove_empty(.,which = "cols")  %>% mutate_all(function(x) ifelse(is.na(x), 0, x))
  
  return(df)
  
}


X.Q <- X.individual.list %>% 
  mutate(Q = map(X, fn.skills))
Joining, by = "Q_UNIQUE_ID"
Joining, by = "Q_UNIQUE_ID"
Joining, by = "Q_UNIQUE_ID"
Joining, by = "Q_UNIQUE_ID"
Joining, by = "Q_UNIQUE_ID"
Joining, by = "Q_UNIQUE_ID"
Joining, by = "Q_UNIQUE_ID"
Joining, by = "Q_UNIQUE_ID"
X.Q 
# A tibble: 8 x 5
  File        X_full              X                   Q_full             Q                 
  <chr>       <list>              <list>              <list>             <list>            
1 Exam1Trial1 <tibble [74 x 940]> <tibble [74 x 286]> <tibble [939 x 5]> <tibble [285 x 5]>
2 Exam1Trial2 <tibble [57 x 940]> <tibble [57 x 277]> <tibble [939 x 5]> <tibble [276 x 5]>
3 Exam2Trial1 <tibble [66 x 940]> <tibble [66 x 236]> <tibble [939 x 5]> <tibble [235 x 2]>
4 Exam2Trial2 <tibble [67 x 940]> <tibble [67 x 237]> <tibble [939 x 5]> <tibble [236 x 2]>
5 Exam3Trial1 <tibble [47 x 940]> <tibble [47 x 178]> <tibble [939 x 5]> <tibble [177 x 2]>
6 Exam3Trial2 <tibble [78 x 940]> <tibble [78 x 179]> <tibble [939 x 5]> <tibble [178 x 2]>
7 Exam4Trial1 <tibble [64 x 940]> <tibble [64 x 239]> <tibble [939 x 5]> <tibble [238 x 2]>
8 Exam4Trial2 <tibble [72 x 940]> <tibble [72 x 239]> <tibble [939 x 5]> <tibble [238 x 2]>
X %>% filter(File == "Exam2Trial2") %>% remove_empty(.,which = "cols") %>%
  gather(key = "Q_UNIQUE_ID", value = "Score", -File, -SubjectID) %>%
  mutate(Q_UNIQUE_ID = as.integer(Q_UNIQUE_ID)) %>% distinct(Q_UNIQUE_ID) %>%

  inner_join(
    Q, by = "Q_UNIQUE_ID"
    
  ) %>% remove_empty(.,which = "cols") %>% mutate_all(function(x) ifelse(is.na(x), 0, x)) %>% summarise_all(sum)
# A tibble: 1 x 2
  Q_UNIQUE_ID   `1`
        <int> <dbl>
1      113706   236
X %>% filter(File == "Exam1Trial1")
NA

# Write one trial's score table and skill matrix to "<File>_X.csv" and
# "<File>_Q.csv" in the hard-coded data directory.
# The parameter names mirror the columns of X.Q because pwalk() passes each
# row's columns by name; X_full and Q_full are accepted but not used.
fn.write <- function(File, X_full, X, Q_full, Q) {

  
  # Echo the table being written.
  print(X)
  X %>% write_csv(paste0("C:\\Users\\rkm160630\\OneDrive - The University of Texas at Dallas\\Ritesh Folder\\PhD\\FirstYearProject\\data\\",File,"_X.csv"))
  Q %>% write_csv(paste0("C:\\Users\\rkm160630\\OneDrive - The University of Texas at Dallas\\Ritesh Folder\\PhD\\FirstYearProject\\data\\",File,"_Q.csv"))
  
}

#walk2(X.Q$File, X.Q$data_clean, X.Q$data_Q_skills, fn.write)

pwalk(X.Q, fn.write)
NA
NA
---
title: "R Notebook"
output: html_notebook
---



```{r}

library(ggplot2)
library(ggpubr)
library(CDM)
library(boot)
library(tidyverse)
library(dummy)
library(stringi)
library(stringr)


```


```{r}
# Load the raw exam export.
# The workspace is deliberately NOT wiped with `rm(list = ls())`: that call only
# deletes global objects (attached packages, options and the working directory
# survive), so it gives a false sense of a clean session and destroys whatever
# the user had loaded. Restart R to get a genuinely fresh state.
x_pre <- read_csv("C:\\Users\\rkm160630\\OneDrive - The University of Texas at Dallas\\Ritesh Folder\\PhD\\FirstYearProject\\data\\OUTPUT.csv")


```

```{r}

# Read the textbook question bank.
Q_from_book <- read_csv("C:\\Users\\rkm160630\\OneDrive - The University of Texas at Dallas\\Ritesh Folder\\PhD\\FirstYearProject\\data\\final_result_similar.csv")

# Replace `Learning Objective` with a normalised copy of the APA objective:
# every "." is removed (so "1.1 ..." becomes "11 ...") and outer whitespace is
# trimmed.
Q_from_book <- mutate(
  Q_from_book,
  `Learning Objective` = str_trim(
    str_remove_all(`APA Learning Objective`, "\\.")
  )
)

# Keep only rows with a real objective; the literal string "nan" marks a
# missing one (filter() also drops rows where the comparison is NA).
Q_from_book <- filter(Q_from_book, `Learning Objective` != "nan")

glimpse(Q_from_book)

```


```{r}
# Assign a sequential integer id (`lo_id`) to each distinct learning objective.
learning_obj <- Q_from_book %>%
  distinct(`Learning Objective`) %>%
  mutate(lo_id = row_number())

# Attach the objective id to every book question and build `temp`: the question
# text with "_" and "." removed and outer whitespace trimmed. `temp` is the key
# later used to match book questions against the exam export.
# The join key is spelled out so the join cannot silently start matching on
# additional shared columns, and no "Joining, by" message is emitted.
Q_pre <- Q_from_book %>%
  inner_join(learning_obj, by = "Learning Objective") %>%
  select(Question, `Learning Objective`, lo_id) %>%
  mutate(temp = str_trim(str_replace_all(Question, "_|\\.", "")))

learning_obj
```

```{r}


# Book questions with their learning-objective id, the normalised match key
# `temp` ("_" and "." removed, whitespace trimmed) and a sequential
# Q_UNIQUE_ID (row number after the join).
# The join key is explicit so that the join cannot silently pick up extra
# shared columns and no "Joining, by" message is emitted.
Q_pre <- Q_from_book %>%
  inner_join(learning_obj, by = "Learning Objective") %>%
  select(Question, `Learning Objective`, lo_id) %>%
  mutate(temp = str_trim(str_replace_all(Question, "_|\\.", ""))) %>%
  mutate(Q_UNIQUE_ID = row_number())
Q_pre

```



```{r}

# Preview the first rows of the raw exam export.
x_pre %>% head()

```

```{r}

# Reshape the export to long form: one row per File / SubjectID / original
# column (`key`) with that cell's content in `value`.
x.gather <- gather(x_pre, key = "key", value = "value", -File, -SubjectID)
x.gather
```



```{r}
# Long-table rows that hold question text (their key contains "Question").
x.questions <- filter(x.gather, str_detect(key, "Question"))

# Take out generic questions (same question text but different answers): any
# text that occurs more than once within a single File/SubjectID is removed
# from the whole table, because the anti-join matches on the text alone.
x.questions <- anti_join(
  x.questions,
  x.questions %>%
    group_by(File, SubjectID, value) %>%
    summarise(cnt = n(), question_number = paste(key, collapse = ",")) %>%
    filter(cnt > 1) %>%
    ungroup(),
  by = "value"
)

# Distinct, non-missing question texts matched to the book questions through
# the normalised key `temp` ("_" and "." removed, whitespace trimmed).
x.questions.dist <- x.questions %>%
  distinct(value) %>%
  drop_na() %>%
  mutate(temp = str_trim(str_replace_all(value, "_|\\.", ""))) %>%
  inner_join(Q_pre, by = "temp")

write_csv(x.questions.dist, "C:\\Users\\rkm160630\\OneDrive - The University of Texas at Dallas\\Ritesh Folder\\PhD\\FirstYearProject\\data\\Q_distinct_id.csv")
x.questions.dist
```


```{r}
# Question-by-objective matrix: one row per Q_UNIQUE_ID, one column per lo_id,
# 1 where the question is tagged with that objective and NA elsewhere.
Q <- x.questions.dist %>%
  distinct(Q_UNIQUE_ID, lo_id) %>%
  arrange(Q_UNIQUE_ID)
Q <- spread(mutate(Q, present = 1), key = "lo_id", value = "present")

write_csv(Q, "C:\\Users\\rkm160630\\OneDrive - The University of Texas at Dallas\\Ritesh Folder\\PhD\\FirstYearProject\\data\\Q.csv")
Q

```

```{r}

# Every long-table row whose key does NOT contain "Question".
x.answers <- filter(x.gather, !str_detect(key, "Question"))

x.answers
```

# Total questions presented to students

53 questions are randomly presented to students.
```{r}
# List the distinct question-slot column names.
distinct(x.questions, key)
```


```{r}

# Attach the book metadata (Question, learning objective, lo_id, Q_UNIQUE_ID)
# to every presented question. The key is the raw question text `value`; it is
# named explicitly so the join cannot silently start matching on any other
# column the two tables might come to share.
x.questions.id <- inner_join(x.questions, x.questions.dist, by = "value")

x.questions.id
```

# Filter out Generic Questions 
Questions with the same text but different answers.
```{r}

# Second generic-question pass, this time on the matched book `Question`: any
# question that occurs more than once within a single File/SubjectID is removed
# everywhere, then the objective columns are dropped.
# NOTE(review): nothing visible in this notebook reads x.questions.id.filterd
# afterwards (X.pre is built from x.questions.id) -- confirm that is intended.
x.questions.id.filterd <- anti_join(
  x.questions.id,
  x.questions.id %>%
    group_by(File, SubjectID, Question) %>%
    summarise(cnt = n(), question_number = paste(key, collapse = ",")) %>%
    filter(cnt > 1) %>%
    ungroup(),
  by = "Question"
) %>%
  select(-lo_id, -`Learning Objective`)

x.questions.id.filterd

```


We have the correct Questions. Now we need to add marks of answers against the questions.
```{r}

# Pair each presented question with its score row.
# `id` is the slot number: the 2nd space-separated token of the question key
# and the 3rd token of the answer key. Columns present on both sides get the
# .x (question) / .y (answer) suffixes; the score is then cast to integer.
X.pre <- inner_join(
  x.questions.id %>%
    mutate(id = str_split(key, " ", simplify = TRUE)[, 2]),
  x.answers %>%
    mutate(id = str_split(key, " ", simplify = TRUE)[, 3]),
  by = c("File", "SubjectID", "id")
) %>%
  mutate(value.y = as.integer(value.y))

X.pre
```

```{r}
# Distinct question ids that received a score.
unique(X.pre[["Q_UNIQUE_ID"]])
```


```{r}


# Response matrix: drop the helper columns, then spread to one column per
# Q_UNIQUE_ID holding the integer score (NA where no score row exists).
X <- X.pre %>%
  select(
    -c(key.x, key.y, value.x, id, temp, lo_id, `Learning Objective`, Question)
  ) %>%
  spread(key = "Q_UNIQUE_ID", value = "value.y")

write_csv(X, "C:\\Users\\rkm160630\\OneDrive - The University of Texas at Dallas\\Ritesh Folder\\PhD\\FirstYearProject\\data\\X.csv")
X
```

Let's run some tests to verify X
```{r}

# Total score per question column, ignoring missing cells.
summarise_all(select(X, -File, -SubjectID), sum, na.rm = TRUE)

```

```{r}

# Long view of X: one row per File / SubjectID / question column.
gather(X, key = "QuestionID", value = "Score", -File, -SubjectID)

```

# Filter questions asked in Exam I

```{r}

library(janitor)
# Exam1Trial1 rows only, without the columns that are empty for that file.
remove_empty(filter(X, File == "Exam1Trial1"), which = "cols")

```




# Questions with good attempt count
```{r}


# Per File and question column: number of missing scores, number of rows, and
# number of non-missing scores. Only combinations with at least 8 non-missing
# scores are kept. The result stays grouped by File, as summarise() leaves it.
question_attempted <- X %>%
  remove_empty(which = "cols") %>%
  gather(key = "QuestionID", value = "Scores", -File, -SubjectID) %>%
  group_by(File, QuestionID) %>%
  summarise(
    total_na = sum(is.na(Scores)),
    total = n(),
    total_attempted = total - total_na
  ) %>%
  filter(total_attempted >= 8)

question_attempted

#%>% filter(QuestionID == "103")

```

Filtering out questions with fewer attempts

```{r}

# Keep only the File/question cells listed in question_attempted, then return
# to wide form (one column per surviving question).
X_filtered <- X %>%
  remove_empty(which = "cols") %>%
  gather(key = "QuestionID", value = "Scores", -File, -SubjectID)
X_filtered <- X_filtered %>%
  semi_join(question_attempted, by = c("File", "QuestionID")) %>%
  spread(key = "QuestionID", value = "Scores")

X_filtered
```



# Take away questions answered fewer than 8 times per exam
```{r}
# Persist the full and the attempt-filtered response matrices, each without
# its empty columns.
write_csv(
  remove_empty(X, which = "cols"),
  "C:\\Users\\rkm160630\\OneDrive - The University of Texas at Dallas\\Ritesh Folder\\PhD\\FirstYearProject\\data\\X.csv"
)

write_csv(
  remove_empty(X_filtered, which = "cols"),
  "C:\\Users\\rkm160630\\OneDrive - The University of Texas at Dallas\\Ritesh Folder\\PhD\\FirstYearProject\\data\\X_filtered.csv"
)
```

Write a separate CSV for each trial so that no file carries columns for questions that were not asked in that trial. This helps to show the true picture of sparsity.

```{r paged.print=FALSE}

# Remove the columns of `df` that janitor::remove_empty() treats as empty.
#   df: a data frame / tibble.
# Returns `df` without its empty columns; rows are untouched.
fn.clean <- function(df) {
  remove_empty(df, which = "cols")
}


# One row per File: the trial's complete score table (X_full), the same table
# without empty columns (X), and a copy of the global Q matrix (Q_full).
X.individual.list <- nest(X, -File, .key = "X_full")
X.individual.list <- mutate(
  X.individual.list,
  X = map(X_full, fn.clean),
  Q_full = map(X_full, function(trial_scores) Q)
)

X.individual.list



```



```{r}

# Exam1Trial1 rows only, without the columns that are empty for that file.
remove_empty(filter(X, File == "Exam1Trial1"), which = "cols")

```

# Merge with Q

```{r}

# Display the question-by-objective matrix Q before merging it with each trial.
Q

```

```{r  paged.print=FALSE}

# Build the skill matrix for one trial.
#   df:       one trial's score table with a SubjectID column and one column
#             per Q_UNIQUE_ID (the column names are converted with
#             as.integer()).
#   q_matrix: question-by-objective matrix with a Q_UNIQUE_ID column. Defaults
#             to the global `Q`, which is what the function previously read
#             implicitly, so existing one-argument calls behave as before.
# Returns the rows of `q_matrix` for the questions present in `df`, with empty
# columns removed and the remaining NA cells replaced by 0.
fn.skills <- function(df, q_matrix = Q) {
  # Distinct question ids that still have a column once empty ones are gone.
  asked <- df %>%
    remove_empty(which = "cols") %>%
    gather(key = "Q_UNIQUE_ID", value = "Score", -SubjectID) %>%
    mutate(Q_UNIQUE_ID = as.integer(Q_UNIQUE_ID)) %>%
    distinct(Q_UNIQUE_ID)

  # Explicit key: the join can no longer silently match on extra shared
  # columns, and no "Joining, by" message is printed for every trial.
  asked %>%
    inner_join(q_matrix, by = "Q_UNIQUE_ID") %>%
    remove_empty(which = "cols") %>%
    mutate_all(function(x) ifelse(is.na(x), 0, x))
}


# Add the per-trial skill matrix (column Q) next to each trial's score table.
X.Q <- mutate(X.individual.list, Q = map(X, fn.skills))

X.Q

# Spot check for Exam2Trial2: rebuild its skill matrix by hand and sum every
# column.
X %>%
  filter(File == "Exam2Trial2") %>%
  remove_empty(which = "cols") %>%
  gather(key = "Q_UNIQUE_ID", value = "Score", -File, -SubjectID) %>%
  mutate(Q_UNIQUE_ID = as.integer(Q_UNIQUE_ID)) %>%
  distinct(Q_UNIQUE_ID) %>%
  inner_join(Q, by = "Q_UNIQUE_ID") %>%
  remove_empty(which = "cols") %>%
  mutate_all(function(x) ifelse(is.na(x), 0, x)) %>%
  summarise_all(sum)


```

```{r}
# All columns of X for the Exam1Trial1 rows.
filter(X, File == "Exam1Trial1")

```


```{r }

# Write one trial's score table and skill matrix to "<File>_X.csv" and
# "<File>_Q.csv".
#   File:    trial name, used as the file-name prefix.
#   X_full:  unused; accepted because pwalk() passes every column of X.Q by
#            name.
#   X:       score table to write.
#   Q_full:  unused; accepted for the same reason as X_full.
#   Q:       skill matrix to write.
#   out_dir: output directory INCLUDING its trailing separator (it is pasted
#            directly in front of the file name). Defaults to the project data
#            folder that used to be hard-coded, so existing calls write to
#            exactly the same paths.
fn.write <- function(File, X_full, X, Q_full, Q,
                     out_dir = "C:\\Users\\rkm160630\\OneDrive - The University of Texas at Dallas\\Ritesh Folder\\PhD\\FirstYearProject\\data\\") {
  # Echo the table being written.
  print(X)
  write_csv(X, paste0(out_dir, File, "_X.csv"))
  write_csv(Q, paste0(out_dir, File, "_Q.csv"))
}

#walk2(X.Q$File, X.Q$data_clean, X.Q$data_Q_skills, fn.write)

# Call fn.write once per row of X.Q; each row's columns are passed by name.
X.Q %>% pwalk(fn.write)


```


